# ---- Setup: libraries, options, theme ----
library(tidyverse)
library(here)
library(glue)
library(copula)
library(knitr)
library(kableExtra)
library(ggsci)
library(MASS)
library(psych)
library(patchwork)
library(directlabels)
library(progress)
library(latex2exp)

# MASS masks dplyr::select; restore the dplyr version.
select <- dplyr::select

options("scipen" = 100, "digits" = 6)

theme_set(
  hrbrthemes::theme_ipsum() +
    theme(legend.position = "top", panel.grid.minor.y = element_blank())
)

library(tufte)
# Invalidate the knitr cache when the tufte version changes.
knitr::opts_chunk$set(cache.extra = packageVersion('tufte'))
options(htmltools.dir.version = FALSE)

#' Fit a smooth, monotone estimate of the CDF of a sample, or of its
#' derivative (the PDF).
#'
#' The empirical CDF is smoothed with a monotone-increasing P-spline
#' (scam, bs = "mpi"); the fitted values at the interior knots are then
#' re-parameterized as a monotone Hyman interpolation spline so that the
#' derivative (the PDF) is well defined and non-negative.
#'
#' @param x  Numeric vector of observations.
#' @param fn Which function to return: "pdf" (default) or "cdf".
#' @return An `approxfun` interpolator of the requested function over the
#'   range of `x`.
get_distribution_function <- function(x, fn = "pdf") {
  # Fail early on an unrecognized `fn` (the original left `out` undefined).
  fn <- match.arg(fn, c("pdf", "cdf"))
  n.knots <- 20
  dat <- tibble(x = x, cdf = ecdf(x)(x))
  n <- length(dat$x)
  # Heavy weights on the two endpoints pin the smooth near 0 and 1.
  fit <- scam::scam(
    cdf ~ s(x, bs = "mpi", k = n.knots),
    data = dat,
    weights = c(n, rep(1, n - 2), 10 * n)
  )
  # Interior knots of the fitted monotone smooth.
  xk <- with(fit$smooth[[1]], knots[4:(length(knots) - 3)])
  # Spline values at the interior knots.
  yk <- predict(fit, newdata = data.frame(x = xk))
  # Re-parameterize into a monotone (Hyman-filtered) interpolation spline.
  f <- stats::splinefun(xk, yk, "hyman")
  if (fn == "pdf") {
    # First derivative of the fitted CDF.
    approxfun(x, f(x, 1))
  } else {
    # Step-function CDF, clamped to [0, 1] outside the data range.
    approxfun(x, f(x), method = "constant", yleft = 0, yright = 1, f = 0)
  }
}
Outline
Introduction
Decision-Relevant Quantities of Interest
Individual-level treatment effects are of policy interest because they allow us to characterize the distribution of treatment effects, not just some summary (e.g., average) in a population.
What fraction of the population is harmed?
What fraction experiences substantial vs. minimal gain?
What is the average treatment response?
What is the median treatment response? The 25th percentile? The 75th percentile?
The Perfect Data
Credible Identification in the Real World
Even if marginal distributions of the outcome are identified (as they are under random assignment or under unconfounded selection), the “copula” that binds them together is unidentified.
This section will cover identification under the most ideal (realistic) real-world scenario: an experimental design with perfect compliance.
In that case we can identify the marginal distribution of the potential outcomes under treatment and non-treatment.
Expanding the Toolkit via Partial Identification
Copulas and the relationship between marginal and joint distributions.
Brief theory
Simulating from a copula
Copulas and the relationship to treatment effects and decision-relevant quantities of interest.
Worst-Case Bounds
Frechet-Hoeffding
Kolmogorov
Williamson and Downs
Firpo and Ridder?
Tightening the Bounds with Covariates
Constructing conditional eCDFs
Tightening the Bounds Using Additional Identifying Assumptions (Experimental Designs)
Frandsen and Lefgren (2021)
Partial Identification with an Exogenous Instrument
Manski IV bounds
Principal Stratification
Frandsen and Lefgren (2021)
Partial Identification in Observational Settings
Callaway (Panel Data)
Changes-in-Changes
Heckman: Selection
Manski: Monotone Treatment Response
Manski: Monotone Selection Response
Code
# Basic "perfect doctor" example data (from Rubin): 10 patients with both
# potential outcomes known.
df <-
  data.frame(
    patient = LETTERS[1:10],
    Y_1 = c(7, 5, 5, 7, 4, 10, 1, 5, 3, 9),
    Y_0 = c(5, 3, 8, 8, 3, 1, 6, 8, 8, 4)
  ) %>%
  # NOTE(review): this overwrites the Y_0 vector defined just above, so the
  # first Y_0 assignment is dead -- confirm which vector is intended.
  mutate(Y_0 = c(1, 6, 1, 8, 2, 1, 10, 6, 7, 8)) %>%
  mutate(delta = Y_1 - Y_0) %>%                     # unit-level treatment effect
  mutate(D = c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0)) %>%  # treatment indicator
  mutate(Y_obs = case_when(D == 1 ~ Y_1, TRUE ~ Y_0)) %>%  # observed outcome
  mutate(sim = 0) %>%                               # flag: original (non-simulated) rows
  mutate(X = rnorm(10, mean = 0, sd = 1))           # a single covariate

# Simulate additional data from a copula that defines the correlation b/t
# potential outcomes, and between a single covariate X and the potential
# outcomes.
m <- 3           # number of columns
n <- 10000 - 10  # sample size (10,000 total including the 10 rows above)
corXY <- 0.8     # correlation between potential outcomes and X
sigma <- matrix(c(1, 0.6, corXY,
                  0.6, 1, corXY,
                  corXY, corXY, 1), nrow = 3)  # correlation matrix

# Sample the joint CDF quantiles from a multivariate normal centered at 0.
set.seed(100)
z <- mvrnorm(n, mu = rep(0, m), Sigma = sigma, empirical = T)
u <- pnorm(z)  # get the quantiles associated with each value

X <-  # a single covariate that is predictive of the outcome
  qnorm(u[, 3], mean = 0, sd = 1)

# Generate the variables.
# NOTE(review): this draw is immediately overwritten below and only advances
# the RNG stream; the effective delta below has mean 1 and SD 1, not the
# mean 2 / SD 1 stated here -- confirm which specification is intended.
delta <-
  rnorm(n, mean = 2, sd = 1)
delta <- 1 + ((X - mean(X)) / (sd(X)))

sY_0 <-  # simulated potential outcome under non-Tx
  # NOTE(review): lambda is the treated-group mean of Y_1 here as well as in
  # sY_1 below -- verify this is intentional rather than a Y_0-based mean.
  qpois(u[, 2], lambda = mean(df[df$D == 1, ]$Y_1)) + rnorm(n = n)
sY_1 <-  # simulated potential outcome under Tx
  qpois(u[, 1], lambda = mean(df[df$D == 1, ]$Y_1)) + delta

df_sim <-  # Construct the final simulated data.
  tibble(patient = glue("sim{1:n}")) %>%
  mutate(Y_1 = sY_1, Y_0 = sY_0, X = X) %>%
  mutate(delta = Y_1 - Y_0) %>%
  mutate(D = as.integer(runif(nrow(.)) < .5)) %>%  # random 50/50 assignment
  mutate(U = rnorm(nrow(.))) %>%
  # mutate(D = as.integer(Y_1 >= Y_0)) %>%
  mutate(sim = 1)

df <-  # Bind it with the general example rows (10 rows).
  df %>%
  bind_rows(df_sim) %>%
  mutate(Y = case_when(D == 1 ~ Y_1, TRUE ~ Y_0))

# (Commented-out alternative simulation, kept for reference.)
# corUY <- 0.3
#
# params <-
#   list(
#     n = 1e4,
#     sigma = matrix(c(1,0.8,0.9,corUY,
#                      0.8,1,0.9,corUY,
#                      0.9,0.9,1,.2,
#                      corUY,corUY,0.2,1), nrow = 4, ncol = 4, byrow = TRUE,
#                    dimnames = list(c("Y1","Y0","X1","U"),c("Y1","Y0","X1","U"))),
#     lambda = 15,
#     delta = 2
#   )
#
# m <- ncol(params$sigma)
#
# # Simulate from a copula
# set.seed(123)
# z <- # Draw from a multivariate normal distribution with covariance matrix sigma
#   mvrnorm(params$n, mu = rep(0, m), Sigma = params$sigma, empirical = T)
# u <- # obtain the quantiles associated with each value
#   pnorm(z)
#
# Y_0 <- qpois(u[,1], params$lambda) + rnorm(n = params$n, mean = 0, sd = 1)
# U <- qnorm(u[,4], mean = 0, sd = 1)
# # tmp_ <- 1 + (Y_0 - mean(Y_0)) / (1*sd(Y_0))
# tmp_ <- 1 + (U - mean(U)) / (1*sd(U))
# Y_1 <- Y_0 + rpois(n = params$n, lambda = params$delta) * tmp_ + U
# X1 <- qnorm(u[,3], mean = 0, sd = 1)
#
# df <- tibble(study_id = paste0("ID",1:params$n), Y_0 = Y_0, Y_1 = Y_1, X1 = X1, U = U)

# Display the first 10 rows of the potential-outcomes table.
df %>%
  head(n = 10) %>%
  select(patient, Y_0, Y_1) %>%
  kable(digits = 2) %>%
  kable_styling()
Figure 1: Scatterplot of Potential Outcomes with 95% Ellipse
Because we can calculate \(Y_i(1)-Y_i(0)\) directly, we can plot its distribution:
Code
# Density of the unit-level treatment effect, with a dashed red line at the
# average effect and a dotted line at zero (the harm threshold).
ggplot(df, aes(x = delta)) +
  geom_density() +
  geom_vline(aes(xintercept = mean(delta)), lty = 2, lwd = 1.1, col = "darkred") +
  annotate("text",
           x = mean(delta), y = .1, hjust = -.1,
           label = "Avg. Treatment Effect", col = "darkred") +
  geom_vline(aes(xintercept = 0), lty = 3) +
  scale_x_continuous(limits = range(df$delta), breaks = seq(-8, 10, 2)) +
  labs(x = TeX("$\\Delta$"), y = "Density")
Figure 2: Distribution of the Treatment Effect (delta)
We can also explore whether the treatment effect magnitude varies by some covariate \(X1\). In Figure 3, we see that the treatment effect tends to be larger for larger values of \(X1\).
Code
# Source: https://stackoverflow.com/questions/61589781/plot-vertical-density-of-normal-distribution-in-r-and-ggplot2

# Fit quantile regressions of the treatment effect on X over a fine grid of
# quantiles; pvals[[i]] is the fitted step-function quantile process for
# observation i.
taus <- seq(0, 1, 0.01)
quantreg_fit <- quantreg::rq(delta ~ X, tau = taus, data = df)
pvals <- predict(quantreg_fit, stepfun = TRUE)
X <- df$X

#' For each requested quantile of X, construct the conditional density of the
#' treatment effect (delta | X) from the fitted quantile process.
#'
#' @param qq      Quantiles of X at which to evaluate (values in (0, 1)).
#' @param scaling Horizontal scaling of the drawn densities.
#'   NOTE(review): currently unused inside the function; the caller below
#'   hard-codes the same factor (10) when positioning the curves -- consider
#'   wiring it through.
#' @return A tibble with columns: y (delta support), x (density height),
#'   X (matched covariate value), tau (quantile), mean_delta (conditional mean).
get_delta_dist <- function(qq, scaling = 10) {
  qq %>%
    map_df(~({
      delta_ <- seq(min(df$delta), max(df$delta), length.out = 1e4)
      # Quantile process of the observation whose empirical quantile of X is
      # closest to the target quantile .x.
      hatF <- pvals[[which(abs(ecdf(X)(X) - .x) < 0.01)[1]]]
      pdfX <- get_distribution_function(hatF(taus), fn = "pdf")
      pdf_ <- tibble(y = delta_, x = pdfX(delta_)) %>% na.omit()
      pdf_norm <-
        # The PDF must integrate to one but sometimes it's a bit off,
        # so re-normalize it to integrate to one.
        sum(c(0, diff(delta_)) * pdfX(delta_), na.rm = TRUE)
      # Conditional mean of delta under the re-normalized density.
      mm <- sum(delta_ * c(0, diff(delta_)) * (pdfX(delta_) / pdf_norm), na.rm = TRUE)
      pdf_ %>%
        mutate(X = X[which(abs(ecdf(X)(X) - .x) < 0.01)[1]]) %>%
        mutate(tau = .x) %>%
        mutate(mean_delta = mm)
    })) %>%
    mutate(max = max(x))
}

df_condpdf <-
  get_delta_dist(c(.01, 0.05, 0.25, 0.5, 0.75, 0.95)) %>%
  mutate(x = X + x * 10) %>%  # shift/scale each density to sit at its X value
  mutate(X_ = X) %>%
  mutate(X = factor(X, labels = unique(.$tau))) %>%
  filter(abs(x - X_) > 0.03)  # drop (near-)zero-density segments

df %>%
  ggplot(aes(x = X, y = delta)) +
  geom_point(alpha = 0.25) +
  geom_path(data = df_condpdf, aes(x = x, y = y, colour = X), lwd = 2) +
  scale_x_continuous(breaks = round(unique(df_condpdf$X_), 2)) +
  scale_y_continuous(
    breaks = c(seq(-20, 20, 10), round(unique(df_condpdf$mean_delta), 2)),
    guide = guide_axis(n.dodge = 2)
  ) +
  scale_color_aaas(name = "Quantile of X") +
  theme(legend.position = "top", panel.grid.minor.y = element_blank()) +
  # BUG FIX: the covariate label was passed as a duplicate `y` argument
  # (silently dropped); it belongs to the x axis.
  labs(y = TeX("$\\Delta$"), x = "Covariate (X) Value") +
  geom_hline(aes(yintercept = 0), lwd = 1.25)
Figure 3: Treatment Response by Covariate Value
We can also plot the cumulative distribution function of the treatment response. This is shown in Figure 4.
Code
# Empirical CDF of the treatment effect, shading the harmed region
# (delta < 0) and marking the average treatment effect.
delta_ <- sort(df %>% pull(delta))
df_ <- data.frame(
  x = delta_,
  y = seq_along(delta_) / length(delta_),  # empirical CDF heights
  region = delta_ < 0                      # TRUE where the effect is harmful
)

# y-axis breaks: a regular grid plus the exact share harmed, Pr(delta <= 0).
br0_ <- unique(sort(c(seq(0, 1, 0.2), ecdf(delta_)(0))))
br0_l <- paste0(round(br0_, 2))

# BUG FIX: the "Avg. Treatment Effect" line was hard-coded at x = 2, but the
# realized average effect is mean(delta_) (~1, cf. Table 1); draw the line
# (and its label) at the actual mean.
ate_ <- mean(delta_)

figdelta <-
  ggplot(df_, aes(x, y)) +
  geom_area(aes(fill = region)) +
  geom_line() +
  labs(x = "x = Treatment Effect", y = TeX("$Pr(\\Delta$<=x)")) +
  scale_fill_manual(values = c('lightgrey', '#C14E4295'), guide = "none") +
  annotate('text', x = 0, y = 0.08, label = 'Harmed', col = "black",
           size = 4, hjust = 1.1) +
  geom_vline(aes(xintercept = ate_), lty = 2, lwd = 1.1, col = "darkred") +
  annotate("text", x = ate_ - 0.5, y = 0.8, hjust = 1,
           label = "Avg. Treatment Effect", col = "darkred", size = 4) +
  scale_y_continuous(breaks = br0_, labels = br0_l) +
  # Mark the point (0, Pr(delta <= 0)) on the curve.
  geom_point(data = tibble(x = 0, y = ecdf(delta_)(0)), aes(x = x, y = y),
             size = 5, pch = 10)
figdelta
Figure 4: Cumulative Distribution Function of the Treatment Effect (delta)
Decision-Relevant Quantities of Interest
Table 1 summarizes various quantities of interest we can calculate with full information on the joint distribution of potential outcomes.
Code
# Decision-relevant quantities computed from the (fully observed)
# distribution of the treatment effect.
tx_effect <- df %>% pull(delta)
delta_ <- seq(floor(min(tx_effect)), ceiling(max(tx_effect)), length.out = 1e6)

# Average effect via the Stieltjes integral of t against the empirical CDF.
avg_delta <- sum(delta_ * c(0, diff(ecdf(tx_effect)(delta_))))

# Population shares by effect size.
frac_harmed <- ecdf(tx_effect)(-1)                             # delta <= -1
frac_minimal_harm <- ecdf(tx_effect)(0) - ecdf(tx_effect)(-1)  # -1 < delta <= 0
frac_low <- ecdf(tx_effect)(1) - ecdf(tx_effect)(0)            # 0 < delta <= 1
frac_high <- 1 - ecdf(tx_effect)(1)                            # delta > 1

# BUG FIX: the first quantile was computed at 0.10 but labeled (and documented
# in the formula column) as the 25th percentile; compute it at 0.25.
q_delta <- quantile(ecdf(tx_effect), c(0.25, 0.5, 0.75))

quantities <-
  tibble(
    average = avg_delta,
    q25 = q_delta[1],
    q50 = q_delta[2],
    q75 = q_delta[3],
    harm = 100 * frac_harmed,
    minimal_harm = 100 * frac_minimal_harm,
    low = 100 * frac_low,
    high = 100 * frac_high
  ) %>%
  gather(quantity, value) %>%
  mutate(quantity = factor(
    quantity,
    levels = c("average", "q25", "q50", "q75",
               "harm", "minimal_harm", "low", "high"),
    labels = c("Average Treatment Response",
               "25th %ile of Tx Response",
               "50th %ile of Tx Response",
               "75th %ile of Tx Response",  # typo "Reponse" fixed
               "Percent with Substantial Harm",
               "Percent with Minimal Harm",
               "Percent with Low Benefit",
               "Percent with High Benefit")
  )) %>%
  mutate(formula = c(
    "$\\int t d \\hat F_{\\Delta}(t)$",
    "$\\hat F^{-1}_{\\Delta}(0.25)$",
    "$\\hat F^{-1}_{\\Delta}(0.50)$",
    "$\\hat F^{-1}_{\\Delta}(0.75)$",
    "$\\hat F_{\\Delta}(-1)$",
    "$\\hat F_{\\Delta}(0)-\\hat F_{\\Delta}(-1)$",
    "$\\hat F_{\\Delta}(1)-\\hat F_{\\Delta}(0)$",
    "$1 - \\hat F_{\\Delta}(1)$"
  ))

quantities %>%
  select(quantity, formula, value) %>%
  kable(booktabs = T, escape = F, align = 'l', format = "markdown",
        col.names = c("Quantity of Interest", "Estimator", "Value"),
        digits = 2) %>%
  kable_classic(full_width = F, position = "center", font_size = 35)
Table 1: Quantities of Interest
Quantity of Interest
Estimator
Value
Average Treatment Response
\(\int t d \hat F_{\Delta}(t)\)
0.99
25th %ile of Tx Response
\(\hat F^{-1}_{\Delta}(0.25)\)
-2.27
50th %ile of Tx Response
\(\hat F^{-1}_{\Delta}(0.50)\)
0.93
75th %ile of Tx Response
\(\hat F^{-1}_{\Delta}(0.75)\)
2.68
Percent with Substantial Harm
\(\hat F_{\Delta}(-1)\)
22.46
Percent with Minimal Harm
\(\hat F_{\Delta}(0)-\hat F_{\Delta}(-1)\)
13.12
Percent with Low Benefit
\(\hat F_{\Delta}(1)-\hat F_{\Delta}(0)\)
15.61
Percent with High Benefit
\(1 - \hat F_{\Delta}(1)\)
48.81
What Can We Point Identify in an Experimental Study?
We can plot the empirical CDF of the treated and control groups. Notice in Figure 5 that the curves cross. This provides suggestive evidence that there is heterogeneity in treatment effects.
If \(F_{Y_1|D=1}\) and \(F_{Y_0|D=1}\) are identified, we can use the marginal distribution of the outcome among the treated and control units to calculate the Average Treatment Effect on the Treated (ATT) and the Quantile Treatment Effect on the Treated (QTT):
The ATT is given by:
\[
ATT=E\left[Y_{1}-Y_{0} \mid D=1\right]
\]
And the QTT is given by:
\[
Q T T(\tau)=F_{Y_{1} \mid D=1}^{-1}(\tau)-F_{Y_{0} \mid D=1}^{-1}(\tau)
\]
Figure 6: Quantile Treatment Effect on the Treated
We can estimate the ATT and QTT using linear regression (e.g., OLS) and quantile regression, respectively. Figure 7 overlays the estimated ATT (red) and QTT estimates (dark blue) based on OLS and quantile regression fit to the experimental data generated above.
Figure 7: Average and Quantile Treatment Effect on the Treated
Can We Partially Identify Other Quantities of Interest?
Simulating from a Copula
## -------------------------------------------------------------------
## Option 1: Cholesky decomposition
## -------------------------------------------------------------------
## Source: "Coping with Copulas", Sec 5.1

# 1. Target correlation matrix for the Gaussian copula.
rho <- 0.6
sigma <- matrix(c(1, rho,
                  rho, 1), nrow = 2, byrow = TRUE)

# 2. Cholesky factor (upper triangular), so that t(A) %*% A == sigma.
A <- chol(sigma)

# 3. Draw iid standard-normal pseudo-random variables ...
tildeY0 <- rnorm(n = 1e6, mean = 0, sd = 1)
tildeY1 <- rnorm(n = 1e6, mean = 0, sd = 1)
# ... collect and correlate them via the Cholesky factor.
tildeY <- cbind(tildeY0, tildeY1) %*% A

# 4. Map to uniform copula quantiles through the standard-normal CDF.
U <- pnorm(tildeY)

# 5. Push the quantiles through Poisson inverse CDFs to obtain
#    correlated count-valued marginals.
Y <- U
Y[, 1] <- qpois(U[, 1], lambda = 10)
Y[, 2] <- qpois(U[, 2], lambda = 12)

# Confirm the induced correlation is close to rho.
cor(Y[, 1], Y[, 2])
## -------------------------------------------------------------------
## Option 2: Use MASS::mvrnorm directly
## -------------------------------------------------------------------
# Draw correlated normals (same `sigma` as Option 1), map them to uniform
# quantiles with pnorm, then through the same Poisson inverse CDFs.
U_ <- pnorm(mvrnorm(n = 1e6, Sigma = sigma, mu = c(0, 0)))

Y_ <- U_
Y_[, 1] <- qpois(U_[, 1], lambda = 10)
Y_[, 2] <- qpois(U_[, 2], lambda = 12)

# Check the induced correlation.
cor(Y_[, 1], Y_[, 2])
Sklar (1959) demonstrates that the joint distribution can be represented as
\[
F_{Y_{1}, Y_{0} \mid D=1}\left(y_1, y_0\right)=C_{Y_{1}, Y_{0} \mid D=1}\left(F_{Y_{1} \mid D=1}\left(y_1\right), F_{Y_{0} \mid D=1}\left(y_0\right)\right)
\] The copulas are given by \[
M(y_1,y_0)=\min \left\{y_1,y_0\right\}
for perfect positive dependence and
\[
W\left(y_1, y_0\right)=\max \left\{y_1+y_0-1,0\right\}
\] for perfect negative dependence.
We can use this to form bounds on the joint distribution (\(H(y_1, y_0)\)) based on the marginals:
Frechet-Hoeffding lower bounds corresponds to case with perfect anticorrelation between potential outcomes.
Upper bounds corresponds to case with perfect correlation between potential outcomes.
They are distributions of perfectly negatively dependent and perfectly positively dependent random variables respectively, see Joe (1997) and Nelsen (1999) for more discussions.
Sharp bounds on the joint distribution of the potential outcomes with identified marginals are given by the Fréchet-Hoeffding lower and upper bound distributions, see Heckman and Smith (1993), Heckman, Smith, and Clements (1997), and Manski (1997b) for their applications in program evaluation. For randomized experiments, Heckman, Smith, and Clements (1997) proposed nonparametric estimates of the Fréchet-Hoeffding distribution bounds and developed a test for the "common effect" model by testing the lower bound of the variance of the treatment effect. They also suggested an alternative test based on the difference between the quantile functions of the marginal distributions of the potential outcomes, referred to as the quantile treatment effect (QTE), see Firpo (2005) or Section 2 for more references. Sharp bounds on the distribution of the treatment effect — the difference between two potential outcomes with identified marginals — are known in the probability literature. A.N. Kolmogorov posed the question of finding sharp bounds on the distribution of a sum of two random variables with fixed marginal distributions. It was first solved by Makarov (1981) and later by Rüschendorf (1982) and Frank, Nelsen, and Schweizer (1987) using different techniques. Frank, Nelsen, and Schweizer (1987) showed that their proof based on copulas can be extended to more general functions than the sum. Sharp bounds on the respective distributions of a difference, a product, and a quotient of two random variables with fixed marginals can be found in Williamson and Downs (1990). More recently, Denuit, Genest, and Marceau (1999) extended the bounds for the sum to arbitrary dimensions and provided some applications in finance and risk management, see Embrechts, Höing, and Juri (2003) and McNeil, Frey, and Embrechts (2005) for more discussions and additional references.
Figure 10: Worst-Case Treatment Effect Distribution Bounds
Frandsen and Lefgren
We’ll next construct bounds based on Frandsen and Lefgren (2021).
Bound Distributions for a Single Y Value
Our first objective is to simply plot the PDF of the untreated group. We’ll obtain this by differentiating the empirical CDF and fitting a smooth function to it.
#' Smooth a CDF into a monotone spline and recover its density.
#'
#' @param F       A CDF function to be evaluated at `x` (e.g. an ecdf).
#' @param x       Sorted numeric support points.
#' @param n.knots Basis dimension for the monotone smooth.
#' @return The tibble (x, cdf) augmented with:
#'   * pdens - density from the first derivative of a monotone Hyman
#'     spline fit through the scam smooth of the CDF;
#'   * dt, dy, dydt - finite-difference density approximation, kept for
#'     comparison with the spline-based estimate.
gen_pdf <- function(F, x, n.knots = 20) {
  dat <- tibble(x, cdf = F(x))
  n <- length(dat$x)
  # Monotone-increasing P-spline fit of the CDF; heavy endpoint weights
  # pin the smooth near 0 and 1.
  fit <- scam::scam(
    cdf ~ s(x, bs = "mpi", k = n.knots),
    data = dat,
    weights = c(n, rep(1, n - 2), 10 * n)
  )
  # Interior knots of the fitted smooth.
  xk <- with(fit$smooth[[1]], knots[4:(length(knots) - 3)])
  # Spline values at the interior knots.
  yk <- predict(fit, newdata = data.frame(x = xk))
  # Re-parameterize into a monotone interpolation spline; its first
  # derivative is the density estimate.
  f <- stats::splinefun(xk, yk, "hyman")
  dat$pdens <- f(x, 1)
  # Finite-difference density (dy/dt) for comparison.
  dat <- dat %>%
    mutate(dt = c(0, diff(x, 1)), dy = c(0, diff(cdf, 1))) %>%
    mutate(dydt = dy / dt)
  return(dat)
}

# Evaluate the control (D == 0) and treated (D == 1) outcome densities on a
# grid that includes the reference value Y0.
# NOTE(review): F0 and F1 (the marginal CDFs) are defined in an earlier,
# unshown chunk -- confirm they are the control/treated eCDFs respectively.
Y0 <- 4
y0_ <- df %>% filter(D == 0) %>% pull(Y)
y1_ <- df %>% filter(D == 1) %>% pull(Y)
df_pdf0 <- gen_pdf(F = F0, x = sort(c(Y0, y0_))) %>% na.omit()
df_pdf1 <- gen_pdf(F = F1, x = sort(c(Y0, y1_))) %>% na.omit()
We next need to find the y value in the treated distribution that maps to the quantile of the chosen value (Y0) in the untreated distribution.
Let’s now define conditional PDFs for the best and worst case.
# Conditional densities of Y(1) for the worst and best case: truncate the
# Y(1) density at the matched value `yy` (computed above) and re-normalize
# each truncated piece so it integrates to one.
df_condpdf_worst <- df_pdf1 %>%
  mutate(
    pdens = ifelse(x <= yy, pdens, 0),  # worst case: mass at or below yy
    pdens = pdens / sum(pdens * dt)     # re-normalize to integrate to one
  )

df_condpdf_best <- df_pdf1 %>%
  mutate(
    pdens = ifelse(x > yy, pdens, 0),   # best case: mass above yy
    pdens = pdens / sum(pdens * dt)     # re-normalize to integrate to one
  )

# Quick visual check of each truncated density.
f2A <- ggplot(df_condpdf_worst, aes(x = x, y = pdens)) +
  geom_point()
# + scale_y_continuous(limits = c(0, 0.5))

f2B <- ggplot(df_condpdf_best, aes(x = x, y = pdens)) +
  geom_point()
# + scale_y_continuous(limits = c(0, 0.5))
Empirical Density for Worst and Best Case
The rightmost panel of the figure shows that an individual with the given Y value has zero probability of drawing a \(Y(1)\) below his or her rank in the control state. Instead, her draw is from the truncated distribution of \(Y(1)\) ranks above her rank in the control state. As noted in the paper, “the best case corresponds to the lower-bound CDF given in equation (1).”
Figure 11: Frandsen-Lefgren (2021) Bounds on the Cumulative Distribution Function of the Treatment Effect (delta)
Figure 12: Worst-Case Bounds on the Average Treatment Effect on the Treated, by Method
Figure 16: Frandsen-Lefgren Treatment Effect Distribution Bounds With Covariates
References
Frandsen, Brigham R., and Lars J. Lefgren. 2021. “Partial Identification of the Distribution of Treatment Effects with an Application to the Knowledge Is Power Program (KIPP).”Quantitative Economics 12 (1): 143–71.